Part 1: Select a Dataset
For this assignment, I selected the Philadelphia Shooting Victims dataset from OpenDataPhilly.
Reasons for choosing this dataset:
- It contains timestamped entries (date and time of incidents).
- It includes demographic information such as age, sex, and race.
- It has categorical outcomes (fatal vs non-fatal).
- The dataset size is moderate, which makes it suitable for analysis in Jupyter.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from pathlib import Path
# Load the raw shooting-victims export.
df = pd.read_csv("Assignment_2 Yanyang_Chen/data/shootings.csv")

# Join the separate date and time columns into one string per row and parse it;
# values that cannot be parsed become NaT instead of raising.
timestamp_text = df["date_"] + " " + df["time"]
df["occur_dt"] = pd.to_datetime(timestamp_text, errors="coerce")

# Derive the calendar features used by the charts below.
when = df["occur_dt"].dt
df["year"] = when.year
df["month"] = when.to_period("M").dt.to_timestamp()  # first day of the month
df["dow"] = when.day_name()
df["hour"] = when.hour

# Keep an independent copy holding only the columns the analysis needs.
analysis_columns = [
    "occur_dt", "year", "month", "dow", "hour",
    "age", "sex", "race", "fatal",
]
df_small = df.loc[:, analysis_columns].copy()
df_small.head()
| | occur_dt | year | month | dow | hour | age | sex | race | fatal |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-02-06 12:17:00 | 2015.0 | 2015-02-01 | Friday | 12.0 | 42.0 | M | B | 0.0 |
| 1 | 2015-03-19 15:40:00 | 2015.0 | 2015-03-01 | Thursday | 15.0 | 19.0 | M | B | 0.0 |
| 2 | 2015-03-29 05:10:00 | 2015.0 | 2015-03-01 | Sunday | 5.0 | 23.0 | M | B | 0.0 |
| 3 | 2015-05-05 21:20:00 | 2015.0 | 2015-05-01 | Tuesday | 21.0 | 40.0 | M | B | 0.0 |
| 4 | 2015-05-05 21:20:00 | 2015.0 | 2015-05-01 | Tuesday | 21.0 | 28.0 | M | B | 0.0 |
# Count victims per calendar month (one row per month that has any records).
per_month = df_small.groupby("month")["occur_dt"].count()
monthly = per_month.reset_index(name="count")

# Smooth with a rolling mean over 3 rows; partial windows are allowed at the
# start. The window counts rows, so it spans three calendar months only when
# no month is missing from the table.
monthly["roll_3m"] = monthly["count"].rolling(window=3, min_periods=1).mean()

# Draw the raw counts and the smoothed series on the same axes.
plt.figure(figsize=(10, 5))
series_styles = [
    ("count", {"label": "Monthly count", "color": "steelblue"}),
    ("roll_3m", {"label": "3-month rolling avg", "linestyle": "--", "color": "darkorange"}),
]
for column, style in series_styles:
    plt.plot(monthly["month"], monthly[column], **style)

plt.title("Monthly Trend of Shooting Victims")
plt.xlabel("Month")
plt.ylabel("Number of Victims")
plt.grid(alpha=0.3)
plt.legend()
plt.show()
This line chart shows the monthly trend of shooting victims. The raw monthly counts fluctuate, but the 3-month rolling average reveals a smoother long-term pattern. Matplotlib was chosen for this chart because it provides precise control over line styles, colors, and annotations, which is ideal for time series visualization.
# Build the day-of-week x hour table of victim counts.
weekday_order = [
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday",
]

# Rows without a parsed timestamp have no weekday or hour and cannot be placed
# in the grid, so drop them first. With the gaps gone, `hour` (stored as float
# whenever some timestamps failed to parse) can be cast to int, which makes the
# x tick labels read 0..23 rather than 0.0..23.0.
timed = df_small.dropna(subset=["occur_dt"]).assign(
    hour=lambda frame: frame["hour"].astype(int)
)

heat = timed.pivot_table(
    index="dow",
    columns="hour",
    values="occur_dt",
    aggfunc="count",
    fill_value=0,  # a weekday/hour pair with no records is a true zero
).reindex(
    # Fix the row order and guarantee all 24 hour columns are present.
    index=weekday_order,
    columns=list(range(24)),
    fill_value=0,
)

# Render the heatmap.
plt.figure(figsize=(12, 4))
sns.heatmap(heat, cmap="YlOrRd")
plt.title("Heatmap of Shooting Victims by Day of Week × Hour", fontsize=14)  # title font size
plt.xlabel("Hour of Day", fontsize=12)
plt.ylabel("Day of Week", fontsize=12)

# Shrink the tick labels so all 24 hours fit along the x axis.
plt.xticks(fontsize=8)
plt.yticks(fontsize=10)
plt.show()
This heatmap shows the temporal distribution of shootings by day of week and hour. It highlights that shootings often occur in the evening, especially on weekends. Seaborn was chosen because it simplifies the creation of statistically informative heatmaps with good default aesthetics.
# Lift Altair's default row cap so the whole table can be passed to the chart.
alt.data_transformers.disable_max_rows()

# Aggregate inside the chart spec: bucket each record by year-month,
# then count the records in every bucket.
month_counts = (
    alt.Chart(df_small)
    .transform_timeunit(ym="yearmonth(occur_dt)")
    .transform_aggregate(count="count()", groupby=["ym"])
)

# One bar per month, with the month and its count shown on hover.
alt_a = (
    month_counts
    .mark_bar()
    .encode(
        x=alt.X("ym:T"),
        y=alt.Y("count:Q"),
        tooltip=[alt.Tooltip("ym:T"), alt.Tooltip("count:Q")],
    )
    .properties(title="Victims per Month (Altair Aggregate)")
)
alt_a
# Histogram of victim age: one bin per year of age on x, record count on y.
age_axis = alt.X("age:Q", bin=alt.Bin(step=1), title="Age (1-year bins)")
count_axis = alt.Y("count()", title="Count")

age_hist = (
    alt.Chart(df_small)
    .mark_bar()
    .encode(x=age_axis, y=count_axis, tooltip=["count()"])
    .properties(
        title="Histogram of Victim Age (1-year bins)",
        width=400,
        height=300,
    )
)
# Leave the chart as the cell's final expression so the notebook renders it.
age_hist
Histogram of Victim Age (1-year bins)
This histogram shows the distribution of victim ages, with each bar representing one year.
From the chart, we can observe that:
- The majority of victims are concentrated in the 20–30 age range, where the bars are the highest.
- The number of victims gradually decreases in the 30–40 age range.
- Victims over 40 years old show a sharp decline, forming a long tail.
- Very few victims are under the age of 10, almost negligible.
Conclusion:
Most shooting victims are young adults, especially those between 20–30 years old. This suggests that young people are at the highest risk, likely due to more frequent social activity and higher exposure to dangerous situations. Preventive measures should therefore pay special attention to this age group.
# Horizontal bar chart of victims per weekday with an interval brush on the
# y axis: bars inside the brush are shaded by count, the rest are greyed out.
weekday_order = [
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday",
]

brush = alt.selection_interval(encodings=["y"])  # brush along the vertical axis

alt_c = (
    alt.Chart(df_small)
    # Records whose timestamp could not be parsed have no weekday. Keep only
    # real weekday names so those records cannot form a spurious extra bar.
    .transform_filter(alt.FieldOneOfPredicate(field="dow", oneOf=weekday_order))
    .transform_aggregate(count="count()", groupby=["dow"])
    .mark_bar()
    .encode(
        y=alt.Y("dow:N", sort=weekday_order, title="Day of Week"),
        x=alt.X("count:Q", title="Number of Victims"),
        color=alt.condition(
            brush,
            alt.Color("count:Q", scale=alt.Scale(scheme="blues")),  # blue gradient when selected
            alt.value("lightgray"),  # grey when outside the brush
        ),
        tooltip=["dow:N", "count:Q"],
    )
    .add_params(brush)
    .properties(
        title="Brush to Highlight Days",
        width=450,
        height=300,
    )
)
alt_c
Brush to Highlight Days
This bar chart displays the distribution of shooting victims by day of the week.
From the visualization, we can see:
- Sunday has the highest number of victims, followed closely by Saturday and Monday.
- Friday has the lowest count compared to other weekdays.
- Overall, incidents are relatively consistent across the week, but weekends show slightly higher numbers.
Conclusion:
Shootings occur on all days of the week, but weekends (especially Sunday and Saturday) see more cases. This trend may be linked to increased social gatherings and outdoor activities during weekends, which elevate the risk of violent incidents.
import altair as alt
import pandas as pd
# --- Precondition: make sure occur_dt / dow exist on df ---
if "occur_dt" not in df.columns:
    df["occur_dt"] = pd.to_datetime(df["date_"] + " " + df["time"], errors="coerce")
df["dow"] = df["occur_dt"].dt.day_name()

dow_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
alt.data_transformers.disable_max_rows()

# --- Palette: the top chart is drawn in LIGHT; the bottom chart uses a
# three-stop LIGHT -> MID -> DARK gradient, so larger counts are darker. ---
LIGHT = "#7FB3D5"  # low end of the gradient; also the top chart's fill
MID = "#4682B4"    # middle stop (steelblue)
DARK = "#1B4F72"   # high end of the gradient

# Anchor the colour domain on the per-weekday counts of the whole dataset.
# NOTE(review): the domain is fixed and clamped, so when a short time range is
# brushed the filtered counts may all fall below `low` and share the LIGHT
# colour. Confirm this is intended.
_counts = df["dow"][df["dow"].isin(dow_order)].value_counts()
low = int(_counts.min())
high = int(_counts.max())
mid = (low + high) / 2

# --- Brush: an interval selection along the x axis (year-month) ---
time_brush = alt.selection_interval(encodings=["x"])

# Top chart: monthly victim counts as an area; dragging on it sets the brush.
monthly_totals = (
    alt.Chart(df)
    .transform_timeunit(ym="yearmonth(occur_dt)")
    .transform_aggregate(count="count()", groupby=["ym"])
)
timeline = (
    monthly_totals
    .mark_area(opacity=0.7, color=LIGHT)
    .encode(
        x=alt.X("ym:T", title="Year–Month"),
        y=alt.Y("count:Q", title="Victim count"),
        tooltip=["ym:T", "count:Q"],
    )
    .add_params(time_brush)
    .properties(width=720, height=140, title="Brush a Time Range")
)

# Bottom chart: victims per weekday, restricted to the brushed time range.
dow_color_scale = alt.Scale(
    domain=[low, mid, high],   # low / mid / high anchors
    range=[LIGHT, MID, DARK],  # light -> mid -> dark
    clamp=True,
)
dow_legend = alt.Legend(
    title="Victims (count)",
    orient="right",
    labelFontSize=11,
    titleFontSize=12,
    gradientLength=160,
)
weekday_totals = (
    alt.Chart(df)
    # The brush selects on ym, so this chart has to derive the same field.
    .transform_timeunit(ym="yearmonth(occur_dt)")
    .transform_filter(time_brush)
    # Discard records whose weekday is missing or not a real weekday name.
    .transform_filter(alt.FieldOneOfPredicate(field="dow", oneOf=dow_order))
    .transform_aggregate(count="count()", groupby=["dow"])
)
by_dow = (
    weekday_totals
    .mark_bar(stroke="#27496D", strokeWidth=0.4)
    .encode(
        y=alt.Y("dow:N", sort=dow_order, title="Day of Week"),
        x=alt.X("count:Q", title="Number of Victims"),
        color=alt.Color("count:Q", scale=dow_color_scale, legend=dow_legend),
        tooltip=["dow:N", "count:Q"],
    )
    .properties(width=720, height=210, title="Day of Week within Brushed Range")
)

# Stack the timeline above the weekday chart.
timeline & by_dow